In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv("C:/Users/DELL/OneDrive/Desktop/cohort 4/Python/loan_approval_dataset.csv")

# Clean column names
df.columns = df.columns.str.strip()

# Copy the dataset
df_encoded = df.copy()

# Clean and encode 'loan_status'
df_encoded['loan_status'] = df['loan_status'].str.strip().map({
    'Approved': 1,
    'Rejected': 0
})

# Encode categorical features
le = LabelEncoder()
df_encoded['education'] = le.fit_transform(df_encoded['education'].str.strip())
df_encoded['self_employed'] = le.fit_transform(df_encoded['self_employed'].str.strip())

# Feature engineering
df_encoded['debt_income_ratio'] = df_encoded['loan_amount'] / df_encoded['income_annum']
df_encoded['monthly_emi'] = df_encoded['loan_amount'] / df_encoded['loan_term']

# Define X and y
X = df_encoded.drop(['loan_id', 'loan_status'], axis=1)
y = df_encoded['loan_status']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict
y_pred = rf_model.predict(X_test)

# Evaluate
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))
✅ Accuracy: 0.9988290398126464

📊 Confusion Matrix:
 [[317   1]
 [  0 536]]

📄 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       318
           1       1.00      1.00      1.00       536

    accuracy                           1.00       854
   macro avg       1.00      1.00      1.00       854
weighted avg       1.00      1.00      1.00       854

In [2]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(rf_model, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())
Cross-validation scores: [0.9941452  0.99531616 0.99765808 0.99648712 0.99765533]
Mean CV accuracy: 0.9962523782983876
In [3]:
import matplotlib.pyplot as plt  # Add this line if not already imported

importances = pd.Series(rf_model.feature_importances_, index=X.columns)
importances.nlargest(10).plot(kind='barh')
plt.title("Top 10 Features Affecting Loan Approval")
plt.xlabel("Feature Importance")
plt.show()
No description has been provided for this image
In [4]:
!pip install plotly
Requirement already satisfied: plotly in c:\users\dell\appdata\local\programs\python\python313\lib\site-packages (6.2.0)
Requirement already satisfied: narwhals>=1.15.1 in c:\users\dell\appdata\local\programs\python\python313\lib\site-packages (from plotly) (1.46.0)
Requirement already satisfied: packaging in c:\users\dell\appdata\local\programs\python\python313\lib\site-packages (from plotly) (25.0)
In [5]:
import plotly.io as pio
pio.renderers.default = 'notebook'  # For classic Jupyter Notebook
In [6]:
import plotly.express as px

fig = px.histogram(df, x="income_annum", color="loan_status",
                   title="Applicant Income vs Loan Approval Status")

fig.show()
In [8]:
import plotly
print(plotly.__version__)
6.2.0
In [10]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Bar(y=[2, 3, 1]))
fig.show()
In [ ]: